import os
import re
from youtube_transcript_api import YouTubeTranscriptApi

from pipelines.prompta.rag.chunk import count_tokens
from pipelines.prompta.rag.collect_wiki import get_final_url
from prompta.utils.set_api import global_openai_client


def find_youtube_vids(text):
    """Collect the YouTube video IDs referenced by ``{{yt|...}}`` tags in *text*.

    The tag name is matched case-insensitively (``yt``, ``YT``, ``Yt``, ``yT``).
    The tag value may be either a bare video ID or a full URL.  In both cases
    a watch URL is passed through ``get_final_url`` and the ID is taken from
    whatever follows the last ``?v=`` in the result.

    Args:
        text: Raw wiki text to scan.

    Returns:
        ``None`` when *text* contains no YouTube tag; otherwise a list of
        video IDs in order of appearance (duplicates are kept).  Tags whose
        value contains ``http`` but no parseable URL are skipped.
    """
    # Capture only the value between "{{yt|" and "}}".
    pattern = r'\{\{[Yy][Tt]\|([^\}]+)\}\}'
    matches = re.findall(pattern, text)
    if not matches:
        return None

    video_ids = []
    for tag_value in matches:
        if 'http' in tag_value:
            # Extract the URL from this tag's own value rather than from any
            # text outside the function, so each tag resolves to its own video.
            urls = extract_urls(tag_value)
            if not urls:
                # "http" appears but no well-formed URL follows it.
                continue
            url = urls[0]
        else:
            url = f"https://www.youtube.com/watch?v={tag_value}"

        # NOTE(review): get_final_url presumably follows redirects to the
        # canonical watch URL -- confirm against collect_wiki.
        final_url = get_final_url(url)
        # Keep at most 11 characters after "?v=", which drops any trailing
        # query parameters (YouTube video IDs are 11 characters long).
        video_ids.append(final_url.split("?v=")[-1][:11])
    return video_ids


# Function to get transcript
def get_transcript(video_id):
    """Return the transcript of *video_id* as one space-joined string.

    Any failure while fetching or joining the transcript is reported through
    the return value instead of being raised: the result is then a message
    of the form ``"Transcript not available for <id>. Error: <exception>"``.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        joined = ' '.join(segment['text'] for segment in segments)
    except Exception as err:
        return f"Transcript not available for {video_id}. Error: {err}"
    return joined
    

def replace_youtube_tags_with_transcripts(text, transcripts):
    """Replace every ``{{yt|...}}`` tag in *text* with its transcript.

    Args:
        text: Wiki text that may contain YouTube tags.
        transcripts: Mapping from video ID to transcript text.  For backward
            compatibility a key equal to the complete tag
            (``"{{yt|<value>}}"``) is honoured as well.

    Returns:
        *text* with each tag substituted by the matching transcript, or by an
        empty string when no transcript matches the tag.
    """
    # Group 0 is the whole tag, group 1 the value between "{{yt|" and "}}".
    pattern = r'\{\{[Yy][Tt]\|([^\}]+)\}\}'

    def replacement(match):
        tag = match.group(0)
        value = match.group(1).strip()
        # Direct hits first: whole tag, exact value, value trimmed to 11
        # characters (IDs produced elsewhere in this file are cut to 11).
        for key in (tag, match.group(1), value, value[:11]):
            if key in transcripts:
                return transcripts[key]
        # The tag value may be a full URL that merely contains the ID.
        for video_id, transcript in transcripts.items():
            if video_id and video_id in value:
                return transcript
        return ''

    # A callable replacement is inserted literally, so backslashes inside a
    # transcript are not interpreted as regex escapes.
    return re.sub(pattern, replacement, text)


def rewrite_transcript(client, transcript):
    """Ask the chat model to rewrite *transcript* more formally and correctly.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        transcript: Raw transcript text to rewrite.

    Returns:
        The rewritten transcript with surrounding whitespace stripped (an
        empty string if the model returned no content).  Failures are not
        raised; a string starting with ``"Error in rewriting transcript: "``
        is returned instead.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Rewrite the following transcript to improve its formality and correctness:\n\n{transcript}"}
            ],
            temperature=0.7,
        )
        message = response.choices[0].message
        # The message is normally an object with a ``content`` attribute and
        # cannot be subscripted; plain dict payloads are still accepted.
        if isinstance(message, dict):
            content = message.get('content')
        else:
            content = message.content
        # Content may be None; treat that as empty text.
        return (content or '').strip()
    except Exception as e:
        return f"Error in rewriting transcript: {e}"


def extract_urls(text):
    """Return every http/https URL found in *text*, in order of appearance.

    A URL starts at ``http://`` or ``https://`` and extends up to the next
    whitespace character (or the end of the text).
    """
    url_regex = re.compile(r'(https?://[^\s]+)')
    return [hit.group(1) for hit in url_regex.finditer(text)]


if __name__ == "__main__":
    # Script entry point: for every .txt file in the raw wiki folder, fetch
    # and rewrite the transcripts of the YouTube videos it references, cache
    # each rewritten transcript on disk, substitute the tags in the text and
    # write the result to the processed folder.
    client = global_openai_client

    # Input folder and the two output folders.
    folder_path = "./wiki_raw"
    transcripts_save_path = "./transcripts"
    wiki_processed_save_path = "./wiki_processed"
    for dir_path in [transcripts_save_path, wiki_processed_save_path]:
        os.makedirs(dir_path, exist_ok=True)

    # IDs whose rewritten transcript is already cached as "<id>.txt".  Kept
    # as a set and updated in place, so the directory is listed only once.
    exist_youtube_ids = {
        name[:-len(".txt")]
        for name in os.listdir(transcripts_save_path)
        if name.endswith(".txt")
    }

    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        youtube_ids = find_youtube_vids(content)
        if youtube_ids:
            transcripts = {}
            for video_id in youtube_ids:
                if video_id in transcripts:
                    # Same video referenced more than once in this file.
                    continue
                transcript_path = os.path.join(transcripts_save_path, video_id + ".txt")

                if video_id in exist_youtube_ids:
                    # Reuse the cached rewritten transcript instead of
                    # downloading the raw one again.
                    with open(transcript_path, 'r', encoding='utf-8') as file:
                        transcripts[video_id] = file.read()
                    continue

                # NOTE(review): get_transcript/rewrite_transcript report
                # failures as ordinary strings, so an error message can end
                # up cached here -- confirm whether that is intended.
                transcript = rewrite_transcript(client, get_transcript(video_id))
                with open(transcript_path, 'w', encoding='utf-8') as file:
                    file.write(transcript)
                transcripts[video_id] = transcript
                exist_youtube_ids.add(video_id)
                print(f"Transcript rewritten: {video_id}")

            content = replace_youtube_tags_with_transcripts(content, transcripts)

        # Files without YouTube tags are copied through unchanged.
        with open(os.path.join(wiki_processed_save_path, filename), 'w', encoding='utf-8') as file:
            file.write(content)
        print(f"File processed: {filename}")

